1. Data Exploration¶

In [3]:
#Import pandas and numpy
import pandas as pd
import numpy as np
In [6]:
#Load the dataset into dataframe
df_train = pd.read_csv('../data/raw/2022_train.csv')
df_test = pd.read_csv('../data/raw/2022_test.csv')
In [7]:
#Get the header of train dataframe
df_train.head()
Out[7]:
Id GP MIN PTS FGM FGA FG% 3P Made 3PA 3P% ... FTA FT% OREB DREB REB AST STL BLK TOV TARGET_5Yrs
0 3799 80 24.3 7.8 3.0 6.4 45.7 0.1 0.3 22.6 ... 2.9 72.1 2.2 2.0 3.8 3.2 1.1 0.2 1.6 1
1 3800 75 21.8 10.5 4.2 7.9 55.1 -0.3 -1.0 34.9 ... 3.6 67.8 3.6 3.7 6.6 0.7 0.5 0.6 1.4 1
2 3801 85 19.1 4.5 1.9 4.5 42.8 0.4 1.2 34.3 ... 0.6 75.7 0.6 1.8 2.4 0.8 0.4 0.2 0.6 1
3 3802 63 19.1 8.2 3.5 6.7 52.5 0.3 0.8 23.7 ... 1.5 66.9 0.8 2.0 3.0 1.8 0.4 0.1 1.9 1
4 3803 63 17.8 3.7 1.7 3.4 50.8 0.5 1.4 13.7 ... 0.5 54.0 2.4 2.7 4.9 0.4 0.4 0.6 0.7 1

5 rows × 21 columns

In [9]:
#Get the header of test dataframe
df_test.head()
Out[9]:
Id GP MIN PTS FGM FGA FG% 3P Made 3PA 3P% FTM FTA FT% OREB DREB REB AST STL BLK TOV
0 0 56 9.1 4.0 1.6 3.7 43.7 0.1 0.3 7.3 0.7 1.2 63.4 1.2 0.8 1.7 0.4 0.2 0.3 0.8
1 1 43 19.3 10.1 3.7 8.1 46.0 0.6 1.7 35.1 1.8 2.5 75.3 0.5 0.9 1.5 3.5 0.6 0.0 1.8
2 2 82 33.9 11.3 4.9 10.6 45.6 0.5 1.9 44.8 1.8 2.7 71.2 1.3 3.3 4.5 2.5 1.3 0.3 2.0
3 3 86 44.7 18.8 6.8 15.9 42.9 0.5 1.8 13.5 4.5 6.3 70.9 1.5 3.2 5.0 4.1 0.9 0.1 3.6
4 4 58 12.3 4.7 1.6 4.0 40.0 0.5 1.7 38.7 1.1 1.3 76.9 0.2 0.6 0.9 1.5 0.5 -0.4 0.9
In [10]:
#Get the shape of train dataframe
df_train.shape
Out[10]:
(8000, 21)
In [11]:
#Get the shape of test dataframe
df_test.shape
Out[11]:
(3799, 20)
In [12]:
#Get the information of train dataframe
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           8000 non-null   int64  
 1   GP           8000 non-null   int64  
 2   MIN          8000 non-null   float64
 3   PTS          8000 non-null   float64
 4   FGM          8000 non-null   float64
 5   FGA          8000 non-null   float64
 6   FG%          8000 non-null   float64
 7   3P Made      8000 non-null   float64
 8   3PA          8000 non-null   float64
 9   3P%          8000 non-null   float64
 10  FTM          8000 non-null   float64
 11  FTA          8000 non-null   float64
 12  FT%          8000 non-null   float64
 13  OREB         8000 non-null   float64
 14  DREB         8000 non-null   float64
 15  REB          8000 non-null   float64
 16  AST          8000 non-null   float64
 17  STL          8000 non-null   float64
 18  BLK          8000 non-null   float64
 19  TOV          8000 non-null   float64
 20  TARGET_5Yrs  8000 non-null   int64  
dtypes: float64(18), int64(3)
memory usage: 1.3 MB
In [13]:
#Get the information of test dataframe
df_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3799 entries, 0 to 3798
Data columns (total 20 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Id       3799 non-null   int64  
 1   GP       3799 non-null   int64  
 2   MIN      3799 non-null   float64
 3   PTS      3799 non-null   float64
 4   FGM      3799 non-null   float64
 5   FGA      3799 non-null   float64
 6   FG%      3799 non-null   float64
 7   3P Made  3799 non-null   float64
 8   3PA      3799 non-null   float64
 9   3P%      3799 non-null   float64
 10  FTM      3799 non-null   float64
 11  FTA      3799 non-null   float64
 12  FT%      3799 non-null   float64
 13  OREB     3799 non-null   float64
 14  DREB     3799 non-null   float64
 15  REB      3799 non-null   float64
 16  AST      3799 non-null   float64
 17  STL      3799 non-null   float64
 18  BLK      3799 non-null   float64
 19  TOV      3799 non-null   float64
dtypes: float64(18), int64(2)
memory usage: 593.7 KB
In [14]:
#Get the descriptive stats of train dataframe
df_train.describe()
Out[14]:
Id GP MIN PTS FGM FGA FG% 3P Made 3PA 3P% ... FTA FT% OREB DREB REB AST STL BLK TOV TARGET_5Yrs
count 8000.00000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 ... 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000 8000.000000
mean 7798.50000 62.777875 18.576662 7.267088 2.807037 6.231212 44.608900 0.264525 0.816562 19.583700 ... 1.947788 71.365825 1.077838 2.168500 3.245300 1.624513 0.648687 0.245212 1.257763 0.833625
std 2309.54541 17.118774 8.935263 4.318732 1.693373 3.584559 6.155453 0.384093 1.060964 16.003155 ... 1.252352 10.430447 0.785670 1.392224 2.085154 1.355986 0.407626 0.821037 0.723270 0.372440
min 3799.00000 -8.000000 2.900000 0.800000 0.300000 0.800000 21.300000 -1.100000 -3.100000 -38.500000 ... 0.000000 -13.300000 0.000000 0.200000 0.300000 0.000000 0.000000 -17.900000 0.100000 0.000000
25% 5798.75000 51.000000 12.000000 4.100000 1.600000 3.600000 40.400000 0.000000 0.100000 8.400000 ... 1.000000 65.000000 0.500000 1.100000 1.700000 0.700000 0.300000 0.100000 0.700000 1.000000
50% 7798.50000 63.000000 16.800000 6.300000 2.400000 5.400000 44.400000 0.300000 0.800000 19.500000 ... 1.700000 71.400000 0.900000 1.900000 2.800000 1.300000 0.600000 0.200000 1.100000 1.000000
75% 9798.25000 74.000000 23.500000 9.500000 3.700000 8.100000 48.700000 0.500000 1.500000 30.600000 ... 2.600000 77.500000 1.500000 2.900000 4.300000 2.200000 0.900000 0.400000 1.600000 1.000000
max 11798.00000 123.000000 73.800000 34.200000 13.100000 28.900000 67.200000 1.700000 4.700000 82.100000 ... 11.100000 168.900000 5.500000 11.000000 15.900000 12.800000 3.600000 18.900000 5.300000 1.000000

8 rows × 21 columns

In [15]:
#Get the descriptive stats of test dataframe
df_test.describe()
Out[15]:
Id GP MIN PTS FGM FGA FG% 3P Made 3PA 3P% FTM FTA FT% OREB DREB REB AST STL BLK TOV
count 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000 3799.000000
mean 1899.000000 62.853909 18.650224 7.328034 2.835404 6.302580 44.599079 0.255962 0.796920 19.234746 1.399842 1.953567 71.612924 1.096025 2.179495 3.275783 1.636483 0.653593 0.257726 1.257910
std 1096.821164 17.151740 8.727259 4.294724 1.688427 3.579221 6.040168 0.380987 1.052862 15.968989 0.926140 1.250376 10.457336 0.785678 1.371935 2.070646 1.335496 0.410573 0.639660 0.712449
min 0.000000 6.000000 3.700000 0.700000 0.300000 0.800000 25.100000 -1.000000 -2.700000 -38.000000 0.000000 0.000000 23.700000 0.000000 0.200000 0.300000 0.000000 0.000000 -7.100000 0.100000
25% 949.500000 51.000000 12.200000 4.200000 1.600000 3.700000 40.500000 0.000000 0.100000 8.500000 0.700000 1.000000 65.000000 0.500000 1.200000 1.800000 0.600000 0.400000 0.100000 0.700000
50% 1899.000000 63.000000 17.000000 6.400000 2.500000 5.500000 44.600000 0.300000 0.800000 19.400000 1.200000 1.700000 71.500000 0.900000 1.900000 2.800000 1.300000 0.600000 0.200000 1.100000
75% 2848.500000 74.000000 23.300000 9.400000 3.700000 8.100000 48.500000 0.500000 1.500000 30.250000 1.900000 2.600000 78.000000 1.500000 2.900000 4.300000 2.300000 0.900000 0.400000 1.600000
max 3798.000000 126.000000 68.000000 33.000000 13.400000 26.200000 74.600000 1.600000 4.300000 73.800000 7.800000 9.800000 127.100000 6.900000 12.000000 18.500000 9.000000 2.700000 14.800000 5.200000
In [16]:
#Compute the pairwise correlation coefficients of the train dataframe.
#Leave the DataFrame as the cell's last expression so Jupyter renders it
#as a rich HTML table instead of a truncated print() wall of text.
df_train_corr = df_train.corr()
df_train_corr
                   Id        GP       MIN       PTS       FGM       FGA  \
Id           1.000000  0.003940 -0.002747  0.003048  0.001812  0.000376   
GP           0.003940  1.000000  0.608090  0.578344  0.577697  0.553374   
MIN         -0.002747  0.608090  1.000000  0.904840  0.895957  0.895085   
PTS          0.003048  0.578344  0.904840  1.000000  0.989208  0.973297   
FGM          0.001812  0.577697  0.895957  0.989208  1.000000  0.972670   
FGA          0.000376  0.553374  0.895085  0.973297  0.972670  1.000000   
FG%          0.010359  0.278510  0.234919  0.301991  0.343140  0.140998   
3P Made     -0.009092  0.123231  0.358597  0.337786  0.288906  0.373362   
3PA         -0.005847  0.119146  0.372454  0.349452  0.299300  0.399025   
3P%         -0.001710  0.045434  0.153846  0.156044  0.123230  0.203040   
FTM          0.008232  0.524276  0.767647  0.864463  0.810273  0.787989   
FTA          0.008845  0.519154  0.753660  0.842903  0.798366  0.761174   
FT%         -0.003565  0.155667  0.212366  0.257157  0.208038  0.263515   
OREB        -0.004322  0.402654  0.542782  0.539289  0.565086  0.464325   
DREB        -0.004517  0.492070  0.770128  0.690687  0.699456  0.635072   
REB         -0.004924  0.484485  0.728842  0.673621  0.689229  0.607930   
AST          0.008069  0.402126  0.639013  0.573893  0.550682  0.619597   
STL          0.013752  0.473409  0.759933  0.678514  0.663419  0.689391   
BLK         -0.005705  0.187269  0.260428  0.232923  0.245199  0.191895   
TOV          0.013185  0.537821  0.792059  0.816414  0.799119  0.816810   
TARGET_5Yrs  0.006645  0.242108  0.182203  0.182004  0.183568  0.162389   

                  FG%   3P Made       3PA       3P%  ...       FTA       FT%  \
Id           0.010359 -0.009092 -0.005847 -0.001710  ...  0.008845 -0.003565   
GP           0.278510  0.123231  0.119146  0.045434  ...  0.519154  0.155667   
MIN          0.234919  0.358597  0.372454  0.153846  ...  0.753660  0.212366   
PTS          0.301991  0.337786  0.349452  0.156044  ...  0.842903  0.257157   
FGM          0.343140  0.288906  0.299300  0.123230  ...  0.798366  0.208038   
FGA          0.140998  0.373362  0.399025  0.203040  ...  0.761174  0.263515   
FG%          1.000000 -0.269994 -0.330586 -0.312446  ...  0.335512 -0.187200   
3P Made     -0.269994  1.000000  0.979340  0.585280  ...  0.088516  0.320845   
3PA         -0.330586  0.979340  1.000000  0.576168  ...  0.101186  0.335011   
3P%         -0.312446  0.585280  0.576168  1.000000  ... -0.054517  0.339894   
FTM          0.279252  0.149567  0.165116  0.013036  ...  0.975611  0.274606   
FTA          0.335512  0.088516  0.101186 -0.054517  ...  1.000000  0.098091   
FT%         -0.187200  0.320845  0.335011  0.339894  ...  0.098091  1.000000   
OREB         0.539317 -0.200575 -0.213506 -0.297108  ...  0.616218 -0.190785   
DREB         0.429142  0.076694  0.069782 -0.105370  ...  0.658778 -0.042316   
REB          0.490902 -0.014098 -0.023208 -0.181554  ...  0.678932 -0.098987   
AST         -0.132006  0.375967  0.409646  0.291623  ...  0.436225  0.306042   
STL          0.071364  0.305817  0.337407  0.195759  ...  0.572355  0.196138   
BLK          0.288155 -0.094750 -0.101143 -0.169376  ...  0.261547 -0.150871   
TOV          0.128622  0.231413  0.255422  0.095169  ...  0.762837  0.179152   
TARGET_5Yrs  0.158858  0.015016  0.003463 -0.011977  ...  0.176525  0.039429   

                 OREB      DREB       REB       AST       STL       BLK  \
Id          -0.004322 -0.004517 -0.004924  0.008069  0.013752 -0.005705   
GP           0.402654  0.492070  0.484485  0.402126  0.473409  0.187269   
MIN          0.542782  0.770128  0.728842  0.639013  0.759933  0.260428   
PTS          0.539289  0.690687  0.673621  0.573893  0.678514  0.232923   
FGM          0.565086  0.699456  0.689229  0.550682  0.663419  0.245199   
FGA          0.464325  0.635072  0.607930  0.619597  0.689391  0.191895   
FG%          0.539317  0.429142  0.490902 -0.132006  0.071364  0.288155   
3P Made     -0.200575  0.076694 -0.014098  0.375967  0.305817 -0.094750   
3PA         -0.213506  0.069782 -0.023208  0.409646  0.337407 -0.101143   
3P%         -0.297108 -0.105370 -0.181554  0.291623  0.195759 -0.169376   
FTM          0.544742  0.619711  0.625154  0.486761  0.591616  0.221537   
FTA          0.616218  0.658778  0.678932  0.436225  0.572355  0.261547   
FT%         -0.190785 -0.042316 -0.098987  0.306042  0.196138 -0.150871   
OREB         1.000000  0.778602  0.897889 -0.003037  0.290161  0.430952   
DREB         0.778602  1.000000  0.970105  0.223116  0.437507  0.441530   
REB          0.897889  0.970105  1.000000  0.149737  0.408584  0.458246   
AST         -0.003037  0.223116  0.149737  1.000000  0.737032 -0.084823   
STL          0.290161  0.437507  0.408584  0.737032  1.000000  0.095003   
BLK          0.430952  0.441530  0.458246 -0.084823  0.095003  1.000000   
TOV          0.378683  0.548209  0.511051  0.738581  0.707753  0.144671   
TARGET_5Yrs  0.172604  0.168065  0.175627  0.096771  0.128381  0.098642   

                  TOV  TARGET_5Yrs  
Id           0.013185     0.006645  
GP           0.537821     0.242108  
MIN          0.792059     0.182203  
PTS          0.816414     0.182004  
FGM          0.799119     0.183568  
FGA          0.816810     0.162389  
FG%          0.128622     0.158858  
3P Made      0.231413     0.015016  
3PA          0.255422     0.003463  
3P%          0.095169    -0.011977  
FTM          0.771939     0.173913  
FTA          0.762837     0.176525  
FT%          0.179152     0.039429  
OREB         0.378683     0.172604  
DREB         0.548209     0.168065  
REB          0.511051     0.175627  
AST          0.738581     0.096771  
STL          0.707753     0.128381  
BLK          0.144671     0.098642  
TOV          1.000000     0.151240  
TARGET_5Yrs  0.151240     1.000000  

[21 rows x 21 columns]
In [70]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.pairplot(df_train, hue='TARGET_5Yrs')
plt.show()
In [67]:
import matplotlib.pyplot as plt
import seaborn as sns

#Drop the Id column so the identifier does not pollute the correlation heatmap
df_train_tmp = df_train.drop(columns=['Id'], axis=1)
cols = df_train_tmp.columns
rows = cols
corr = df_train_tmp.corr()

# Click event definition start
def onclick(event):
    """Update the scatter plot on the right panel with the pair of
    columns under the clicked heatmap cell.

    NOTE(review): interactivity requires an interactive matplotlib
    backend (e.g. %matplotlib widget); with a static inline backend the
    handler is registered but clicks never fire — confirm the backend.
    """
    x = event.xdata
    y = event.ydata
    axes = event.inaxes
    # identity/None checks: click outside any axes yields None coordinates
    if x is None or y is None:
        return
    if axes != ax[0]:
        return

    col = cols[int(x)]
    row = rows[int(y)]

    # BUG FIX: the original body referenced an undefined name `df`
    # (NameError on first click); the plotted data must come from
    # df_train_tmp, the frame the heatmap was built from.
    xwid = df_train_tmp[col].max() - df_train_tmp[col].min()
    ywid = df_train_tmp[row].max() - df_train_tmp[row].min()

    # update the plot on the right side
    ax[1].set_title('Correlation coefficient ' + col + ' - ' + row + ': ' \
                    + str(round(corr.loc[row, col], 3)))
    ax[1].set_xlabel(col)
    ax[1].set_ylabel(row)
    ln.set_data(df_train_tmp[col], df_train_tmp[row])
    # pad the axis limits by 10% of the data range on each side
    ax[1].set_xlim(df_train_tmp[col].min() - xwid * 0.1, df_train_tmp[col].max() + xwid * 0.1)
    ax[1].set_ylim(df_train_tmp[row].min() - ywid * 0.1, df_train_tmp[row].max() + ywid * 0.1)
    plt.draw()
# Click event definition end

fig, ax = plt.subplots(1, 2, figsize=(12, 6))
sns.heatmap(corr, ax=ax[0], vmin=-1, vmax=1, annot=True, fmt="3.2f", annot_kws={"size": 5})
ln, = ax[1].plot([np.nan], [np.nan], 'o') #empty line; filled in by onclick
ax[0].set_aspect('equal')
#Register the click handler on the figure canvas
fig.canvas.mpl_connect('button_press_event', onclick)
#Display the heatmap
plt.show()

2. Data Preparation (pre-process)¶

In [18]:
#Create a copy of dataframe
df_train_cleaned = df_train.copy()
df_test_cleaned = df_test.copy()
In [19]:
#Extract the column "TARGET_5Yrs" into a target variable
target = df_train_cleaned.pop('TARGET_5Yrs')
In [47]:
#Apply the StandardScaler: fit on the TRAIN data only, then apply the
#same (train-fitted) transformation to the test data.
#BUG FIX: the original called fit_transform on the test set as well,
#which re-fits the scaler on the test distribution — the train and test
#features end up on inconsistent scales, and the scaler saved afterwards
#is the test-fitted one.
#NOTE(review): the Id column is scaled along with the features here —
#consider dropping it before scaling; confirm downstream usage.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_train_cleaned = scaler.fit_transform(df_train_cleaned)
df_test_cleaned = scaler.transform(df_test_cleaned)
In [73]:
#Save the scaler into the folder "models" and name it
from joblib import dump
dump(scaler, '../models/scaler.joblib')
Out[73]:
['../models/scaler.joblib']
In [49]:
#Import train_test_split from sklearn.model_selection
from sklearn.model_selection import train_test_split
In [50]:
#Split randomly the train dataframe with random_state=8 into 2 different sets: training (80%) and validation (20%)
X_train, X_val, y_train, y_val = train_test_split(df_train_cleaned, target, test_size=0.2, random_state=8)
In [51]:
#Set the test dataframe into X_test variable
X_test = df_test_cleaned.copy()
In [52]:
#Save the different sets into the folder ../data/processed
#(np.save appends the .npy extension automatically)
np.save('../data/processed/X_train', X_train)
np.save('../data/processed/X_val',   X_val)
np.save('../data/processed/X_test',  X_test)
np.save('../data/processed/y_train', y_train)
np.save('../data/processed/y_val',   y_val)

3. Get Baseline Model¶

In [53]:
#Calculate the average of the target variable for the training set and save it into a variable called `y_mean`
y_mean = y_train.mean()
In [54]:
#Create a numpy array called `y_base` of dimensions (len(y_train), 1) filled with this value
y_base = np.full((len(y_train), 1), y_mean)
In [55]:
#Import the MSE and MAE metrics from sklearn
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
In [56]:
#Display the RMSE and MAE scores of the baseline on the training set.
#np.sqrt of the MSE replaces `squared=False`, which was deprecated and
#later removed from sklearn's mean_squared_error.
#NOTE(review): RMSE/MAE on a binary target is an unusual baseline for a
#classification task — confirm this is the intended baseline metric.
print(np.sqrt(mse(y_train, y_base)))
print(mae(y_train, y_base))
0.3736996987472026
0.27930292968750003

4. Apply the Logistic Regression with Sklearn¶

In [57]:
#Import logistic regression from sklearn linear model
from sklearn.linear_model import LogisticRegression
In [58]:
#Instantiate a logistic regression classifier.
#Every argument except max_iter is sklearn's default, written out
#explicitly so the full configuration is visible at a glance.
model = LogisticRegression(
    penalty='l2',          # L2 regularisation
    dual=False,            # solve the primal problem
    tol=0.0001,            # optimisation stopping tolerance
    C=1.0,                 # inverse of regularisation strength
    fit_intercept=True,    # learn a bias term
    intercept_scaling=1,
    class_weight=None,     # classes weighted equally
    random_state=None,
    solver='lbfgs',
    max_iter=500,          # raised from the default 100 to ensure convergence
    multi_class='auto',
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,         # only used with elastic-net penalty
)
In [59]:
#Train the model
model.fit(X_train, y_train)
Out[59]:
LogisticRegression(max_iter=500)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=500)
In [71]:
#Dump the model
from joblib import dump
dump(model,  '../models/logreg_sklearn.joblib')
Out[71]:
['../models/logreg_sklearn.joblib']

5. Evaluation¶

In [61]:
#Calculate the mean accuracy on the training and validation sets
#(LogisticRegression.score returns accuracy, not R squared)
print("Train data accuracy: {:.3f}".format(model.score(X_train, y_train)))
print("Validation data accuracy: {:.3f}".format(model.score(X_val, y_val)))
Train data accuracy: 0.835
Validation data accuracy: 0.835
In [62]:
#Display the fitted coefficients (one per scaled feature, in column
#order of the training matrix) and the intercept
print("coefficient = ", model.coef_)
print("intercept = ", model.intercept_)
coefficient =  [[ 0.04127634  0.49660904  0.0764936  -0.14406699  0.18976476 -0.10898981
   0.16099272  0.66269107 -0.64815231  0.02244343 -0.24519065  0.3460834
   0.12228272  0.54551016  0.28446651 -0.61289122  0.19928254 -0.04618178
   0.1051189  -0.07319577]]
intercept =  [1.81202169]
In [63]:
#Save the predictions from this model for training and validation sets into 2 variables
y_train_preds = model.predict(X_train)
y_val_preds = model.predict(X_val)
In [64]:
#Display the RMSE and MAE scores of this model on the training set.
#np.sqrt of the MSE replaces `squared=False`, which was deprecated and
#later removed from sklearn's mean_squared_error.
print(np.sqrt(mse(y_train, y_train_preds)))
print(mae(y_train, y_train_preds))
0.406201920231798
0.165
In [65]:
#Display the RMSE and MAE scores of this model on the validation set.
#np.sqrt of the MSE replaces `squared=False`, which was deprecated and
#later removed from sklearn's mean_squared_error.
print(np.sqrt(mse(y_val, y_val_preds)))
print(mae(y_val, y_val_preds))
0.406201920231798
0.165